Raw data

- Study summary: Neurological complications are common in patients with COVID-19. While SARS-CoV-2, the causal pathogen of COVID-19, has been detected in some patient brains, its ability to infect brain cells and impact their function are not well understood, and experimental models using human brain cells are urgently needed. Here we investigated the susceptibility of human induced pluripotent stem cell (hiPSC)-derived monolayer brain cells and region-specific brain organoids to SARS-CoV-2 infection. We found modest numbers of infected neurons and astrocytes, but greater infection of choroid plexus epithelial cells. We optimized a protocol to generate choroid plexus organoids from hiPSCs, which revealed productive SARS-CoV-2 infection that leads to increased cell death and transcriptional dysregulation indicative of an inflammatory response and cellular function deficits. Together, our results provide evidence for SARS-CoV-2 neurotropism and support use of hiPSC-derived brain organoids as a platform to investigate the cellular susceptibility, disease mechanisms, and treatment strategies for SARS-CoV-2 infection. Bulk RNA-seq of choroid plexus organoids (CPOs) was performed on mock 72 hours post-infection (hpi), SARS-CoV-2 24 hpi, and SARS-CoV-2 72 hpi samples. All conditions were profiled in triplicate.

Loading packages

library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(pheatmap)
library(gridExtra)
library(UpSetR)
library(ensembldb)
library(apeglm)
library(ashr)

Setting AnnotationHub

DB <- "EnsDb"                        # Set your DB of interest
AnnotationSpecies <- "Homo sapiens"  # Set your species 
ah <- AnnotationHub(hub=getAnnotationHubOption("URL"))  # Bring annotation DB

Running AnnotationHub

# Filter annotation of interest
ahQuery <- query(ah, 
                 pattern=c(DB, AnnotationSpecies), 
                 ignore.case=T)      


# Select the most recent data
DBName <- mcols(ahQuery) %>%
    rownames() %>%
    tail(1)

AnnoDb <- ah[[DBName]] 

# Explore your EnsDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="TXID")
# Note: Annotation has to be done with not genome but transcripts 
AnnoDb <- select(AnnoDb, 
                 AnnoKey,
                 keytype="TXID",
                 columns=c("GENEID", "GENENAME")) 


# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)    # The column 1 has to assign transcript (e.g. ENSEMBLTRANS)
##              TXID          GENEID GENENAME
## 1 ENST00000000233 ENSG00000004059     ARF5
## 2 ENST00000000412 ENSG00000003056     M6PR
## 3 ENST00000000442 ENSG00000173153    ESRRA
## 4 ENST00000001008 ENSG00000004478    FKBP4
## 5 ENST00000001146 ENSG00000003137  CYP26B1
## 6 ENST00000002125 ENSG00000003509  NDUFAF7

Defining file name and path for .sf files

.sf files have been created from fastq data by salmon

# This code chunk needs to be written by yourself 
#
# Define sample names 
SampleNames <-  c(paste0("Mock-rep", 1:3), paste0("SARS-CoV-2-rep", 1:3)) 

# Define group level
GroupLevel <- c("Mock", "Covid")

# Define contrast for DE analysis
Contrast <- c("Group", GroupLevel)


# Define a vector for comparing TPM vs Counts effect 
TvC <- c("TPM", "Counts")
levels(TvC) <- TvC

# Define .sf file path
sf <- c(paste0("salmon_output/", 
               SampleNames,
               ".salmon_quant/quant.sf"))

# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))

# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
                       Group=factor(group, levels=GroupLevel),
                       Path=sf)

rownames(metadata) <- SampleNames

# Explore the metadata
print(metadata)
##                          Sample Group
## Mock-rep1             Mock-rep1  Mock
## Mock-rep2             Mock-rep2  Mock
## Mock-rep3             Mock-rep3  Mock
## SARS-CoV-2-rep1 SARS-CoV-2-rep1 Covid
## SARS-CoV-2-rep2 SARS-CoV-2-rep2 Covid
## SARS-CoV-2-rep3 SARS-CoV-2-rep3 Covid
##                                                                Path
## Mock-rep1             salmon_output/Mock-rep1.salmon_quant/quant.sf
## Mock-rep2             salmon_output/Mock-rep2.salmon_quant/quant.sf
## Mock-rep3             salmon_output/Mock-rep3.salmon_quant/quant.sf
## SARS-CoV-2-rep1 salmon_output/SARS-CoV-2-rep1.salmon_quant/quant.sf
## SARS-CoV-2-rep2 salmon_output/SARS-CoV-2-rep2.salmon_quant/quant.sf
## SARS-CoV-2-rep3 salmon_output/SARS-CoV-2-rep3.salmon_quant/quant.sf

Converting .sf files to txi list

- txi_tpm: stores TPM with the argument “countsFromAbundance=”lengthScaledTPM"

- txi_counts: stores original counts

- In this project, two txi objects were created with or without the countsFromAbundance=“lengthScaledTPM” argument and compared in downstream DE analysis.

- If you don’t want gene-level summarization, set txOut=TRUE.

- References: tximport doc, DESeq2 doc “Why unnormalized counts?”, Soneson et al. 2016, Developer Dr. Love’s comment, Harvard Chan Bioinformatics Core workshop

# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames

# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    countsFromAbundance="lengthScaledTPM", # Extracts TPM 
                    ignoreTxVersion=T) 

txi_counts <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    ignoreTxVersion=T) 

# tpm 
head(txi_tpm$counts)
##                   Mock-rep1    Mock-rep2 Mock-rep3 SARS-CoV-2-rep1
## ENSG00000000003 9517.232304 7375.7279909 7413.6544     10930.25206
## ENSG00000000005    7.046584    3.9312090    0.0000         2.00487
## ENSG00000000419  878.259656  916.2032201  728.9493      1201.89725
## ENSG00000000457  711.680520  553.4857448  561.2216       777.32713
## ENSG00000000460  465.052498  352.6689833  391.7890       731.41427
## ENSG00000000938    0.000000    0.9944634    0.0000         0.00000
##                 SARS-CoV-2-rep2 SARS-CoV-2-rep3
## ENSG00000000003    6030.5160049     7834.953120
## ENSG00000000005       0.9864265        4.002106
## ENSG00000000419     783.6348859      991.725547
## ENSG00000000457     459.4814539      603.460367
## ENSG00000000460     377.0127208      536.898627
## ENSG00000000938       0.0000000        0.000000
dim(txi_tpm$counts)
## [1] 60240     6
# counts
head(txi_counts$counts)
##                 Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000003  8991.942  7660.942  7504.000       11037.869        6193.989
## ENSG00000000005     7.000     4.000     0.000           2.000           1.000
## ENSG00000000419   889.000   928.001   730.000        1211.001         777.000
## ENSG00000000457   699.297   564.374   566.745         832.783         437.987
## ENSG00000000460   452.703   366.157   397.262         740.218         388.013
## ENSG00000000938     0.000     1.000     0.000           0.000           0.000
##                 SARS-CoV-2-rep3
## ENSG00000000003        7772.929
## ENSG00000000005           4.000
## ENSG00000000419         991.000
## ENSG00000000457         596.733
## ENSG00000000460         514.268
## ENSG00000000938           0.000
dim(txi_counts$counts)
## [1] 60240     6

Creating DESeq objects from txi and VST

- Note: The tximport-to-DESeq2 approach uses estimated gene counts from the transcript abundance quantifiers, but not normalized counts.

- The DESeqDataSetFromTximport() function generated an DESeq object (aka dds) with the txi input.

- vst() was run to perform variance stabilizing transformation instead of rlog() which takes longer time with similar characteristics.

- The vsd object created by vst() is used for not DE analysis but QC.

- References: DESeq2 doc “Transcript abundance files”, DESeq2 doc “Variance stabilizing transformation”

# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) { 

    # Create a DESeq object (so-calledd dds) 
    des <- DESeqDataSetFromTximport(txi, 
                                    colData=metadata,
                                    design=~Group)

    # Create a vsd object (so-called vsd) 
    ves <- vst(des, blind=T)

    # Output them as a list 
    return(list(dds=des, vsd=ves))

}

TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']] 
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]

Exploring created dds

# TPM 
TPM[[1]]
## class: DESeqDataSet 
## dim: 60240 6 
## metadata(1): version
## assays(1): counts
## rownames(60240): ENSG00000000003 ENSG00000000005 ... ENSG00000288674
##   ENSG00000288675
## rowData names(0):
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
##                 Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000003      9517      7376      7414           10930            6031
## ENSG00000000005         7         4         0               2               1
## ENSG00000000419       878       916       729            1202             784
## ENSG00000000457       712       553       561             777             459
## ENSG00000000460       465       353       392             731             377
## ENSG00000000938         0         1         0               0               0
##                 SARS-CoV-2-rep3
## ENSG00000000003            7835
## ENSG00000000005               4
## ENSG00000000419             992
## ENSG00000000457             603
## ENSG00000000460             537
## ENSG00000000938               0
# Counts
Counts[[1]]
## class: DESeqDataSet 
## dim: 60240 6 
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(60240): ENSG00000000003 ENSG00000000005 ... ENSG00000288674
##   ENSG00000288675
## rowData names(0):
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
##                 Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000003      8992      7661      7504           11038            6194
## ENSG00000000005         7         4         0               2               1
## ENSG00000000419       889       928       730            1211             777
## ENSG00000000457       699       564       567             833             438
## ENSG00000000460       453       366       397             740             388
## ENSG00000000938         0         1         0               0               0
##                 SARS-CoV-2-rep3
## ENSG00000000003            7773
## ENSG00000000005               4
## ENSG00000000419             991
## ENSG00000000457             597
## ENSG00000000460             514
## ENSG00000000938               0

Exploring created vsd

# TPM
TPM[[2]]
## class: DESeqTransform 
## dim: 60240 6 
## metadata(1): version
## assays(1): ''
## rownames(60240): ENSG00000000003 ENSG00000000005 ... ENSG00000288674
##   ENSG00000288675
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform 
## dim: 60240 6 
## metadata(1): version
## assays(1): ''
## rownames(60240): ENSG00000000003 ENSG00000000005 ... ENSG00000288674
##   ENSG00000288675
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path

Estimating size factors, dispersions, and conducting Wald Test

- Dispersion is calculated as a measure of variation instead of variance since variance gets larger when gene expression gets higher.

- Wald test is the default setting of DESeq2 which tests null hypothesis between two groups. You should use Likelihood ratio test (LRT) when comparing more than two groups.

- Messages when “Counts <- DESeqPrep_fn(Counts)” was run: using ‘avgTxLength’ from assays(dds)

- References: Harvard Chan Bioinformatics Core workshop I, Harvard Chan Bioinformatics Core workshop II, Harvard Chan Bioinformatics Core workshop III, DESeq2 "Wald test indivisual steps, DESeq2 doc “Likelihood ratio test”

# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
    
    List[[1]] <- estimateSizeFactors(List[[1]])
    List[[1]] <- estimateDispersions(List[[1]])
    List[[1]] <- nbinomWaldTest(List[[1]])
   
    return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts) 
TPM <- DESeqPrep_fn(TPM)

Exploring size factors

sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
##       Mock-rep1       Mock-rep2       Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2 
##       1.0962587       1.0446876       0.8945224       1.3649447       0.7478616 
## SARS-CoV-2-rep3 
##       1.0185737
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead! 
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
##                          Sample    Group                   Path
##                        <factor> <factor>            <character>
## Mock-rep1       Mock-rep1          Mock  salmon_output/Mock-r..
## Mock-rep2       Mock-rep2          Mock  salmon_output/Mock-r..
## Mock-rep3       Mock-rep3          Mock  salmon_output/Mock-r..
## SARS-CoV-2-rep1 SARS-CoV-2-rep1    Covid salmon_output/SARS-C..
## SARS-CoV-2-rep2 SARS-CoV-2-rep2    Covid salmon_output/SARS-C..
## SARS-CoV-2-rep3 SARS-CoV-2-rep3    Covid salmon_output/SARS-C..
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
##                          Sample    Group                   Path sizeFactor
##                        <factor> <factor>            <character>  <numeric>
## Mock-rep1       Mock-rep1          Mock  salmon_output/Mock-r..   1.096259
## Mock-rep2       Mock-rep2          Mock  salmon_output/Mock-r..   1.044688
## Mock-rep3       Mock-rep3          Mock  salmon_output/Mock-r..   0.894522
## SARS-CoV-2-rep1 SARS-CoV-2-rep1    Covid salmon_output/SARS-C..   1.364945
## SARS-CoV-2-rep2 SARS-CoV-2-rep2    Covid salmon_output/SARS-C..   0.747862
## SARS-CoV-2-rep3 SARS-CoV-2-rep3    Covid salmon_output/SARS-C..   1.018574

Plotting the size factors in TPM

- The size factors are only available from TPM dds

- Blue dashed line: normalization factor = 1

# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))

colnames(sizeFactor) <- 'Size_Factor'

sizeFactor <- sizeFactor %>%
    rownames_to_column(var="Sample") %>%
    inner_join(metadata[, 1:ncol(metadata)-1], by="Sample") 

sizeFactor$Sample <- factor(sizeFactor$Sample, levels=SampleNames)



# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample, 
                       y=Size_Factor, 
                       fill=Group,
                       label=Size_Factor)) +
    geom_bar(stat="identity", width=0.8) +
    theme_bw() + 
    ggtitle("Size Factors in TPM-DESeq") +
    geom_text(vjust=1.5) +
    theme(axis.text.x=element_text(angle=45, 
                                   vjust=0.5)) + ylab("Size Factor") + geom_hline(yintercept=1, color="blue", linetype="dashed")

Plotting nornalization factors in the Counts

- DESeq2 performs an internal normalization where geometric mean is calculated for each gene across all samples. The counts for a gene in each sample is then divided by this mean. The median of these ratios in a sample is the size factor for that sample.

- Blue dashed line: normalization factor = 1

- Colored dots: normlization factors per gene (y-axis) in each sample

- Box plots: distribution of the normalization facters in each group (see the second plot)

- Reference: DESeq2 doc “Sample-/gene-dependent normalization factors”

# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
    gather(Sample, Normalization_Factor) %>%
    inner_join(metadata[, 1:2], by="Sample") 
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors 
normFactors_plot <- ggplot(normf, 
       aes(x=Sample, y=Normalization_Factor)) + 
geom_jitter(alpha=0.5, aes(color=Group)) + 
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor), 
             outlier.shape=NA, alpha=0.5) + 
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45, 
                               vjust=0.5)) + 
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1, 
           color="blue", 
           linetype="dashed")
# Print the normalization factor scatter plot 
print(normFactors_plot)

# Print the same plot with larger y-magnification in order to observe the box plot 
normFactors_plot + 
    ylim(0.5, 1.5)

Setting functions for QC plots

- Reference: DESeq2 doc “Principal component plot of the samples”, DESeq2 doc “Heatmap of the sample-to-sample distances”

# Assigne what to compare
GroupOfInterest <- Contrast[1] 

# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {

    plotPCA(inputList[[2]],    # takes vsd
            intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}

# Set heatmap annotation 
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]

# Set a function for a correlation heatmap 
QCcorrHeatmap_fn <- function(inputList, Title) {

    # Extract transformed count matrix
    mtx <- assay(inputList[[2]])      # takes vsd

    # Calculate correlation and store in the matrix
    mtx <- cor(mtx)
    
    # Create a correlation heatmap
    return(pheatmap(mtx, 
             annotation=HeatmapAnno,
             main=paste("Sample Correlation Heatmap:",
                        Title)))
}

Sample QC: Principal Component Analysis (PCA)

- Checkpoints in PCA: source of variation, sample outlier

grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"), 
             QCPCA_fn(Counts, "QC PCA: Counts"), 
             nrow=2)

Sample QC: Sample Correlation Heatmap

- Checkpoints of correlation heatmap: distance between samples, correlation in a group

- Upper: TPM input

- Lower: Counts input

# TPM
QCcorrHeatmap_fn(TPM, "TPM") 

# Counts
QCcorrHeatmap_fn(Counts, "Counts") 

Running DE analysis with or without shrinkage

- Shrinkage reduces false positives

- Magnitude of shrinkage is affected by dispersion and sample size

- When the argument type is set to “apeglm”, the coef argument is used instead of constrast. In this dataset, you can set coef=Coef where Coef <- resultsNames(ddsList[[1]]).

- When the type is set to “normal”, the argument contrast is set as shown below.

- References: DESeq2 doc “Alternative shrinkage estimators”, Harvard Chan Bioinformatics Core workshop

# Create a list consisted with dds objects from TPM and Counts
desList <- list(TPM[[1]], Counts[[1]]) 
names(desList) <- TvC

# Create a list for TPM and Counts dds 
ddsList <- desList  # DE without shrinkage  
normal.ddsList <- desList    # DE with "normal" shrinkage
ape.ddsList <- desList       # DE with "apeglm" shrinkage
ash.ddsList <- desList       # DE with "ashr" shrinkage

for (x in TvC) {
    
    # Run DESeq() 
    ddsList[[x]] <- DESeq(desList[[x]])
    print(resultsNames(ddsList[[x]]))

    normal.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                                contrast=Contrast,
                                type="normal")

    ape.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                             coef=resultsNames(ddsList[[x]])[2],
                             type="apeglm")

    ash.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                             coef=resultsNames(ddsList[[x]])[2],
                             type="ashr")

}
## [1] "Intercept"           "Group_Covid_vs_Mock"
## [1] "Intercept"           "Group_Covid_vs_Mock"
# Combine every ddsList into a list
all.ddsList <- list(ddsList, normal.ddsList, ape.ddsList, ash.ddsList)
shrinkage <- c("None", "Normal", "Apeglm", "Ashr")
names(all.ddsList) <- shrinkage

Creating dispersion plots

- Dispersion is important since estimation by DESeq2 algorithm is based on the assumption that genes with similar expression levels have similar dispersion. If an RNA-seq dataset doesn’t satisfy this assumption, use other DE algorithms than DESeq2.

- References: DESeq2 doc "Dispersion plot and fitting alternatives, Harvard Chan Bioinformatics Core workshop

# Plot dispersion  
for (x in TvC) {

    plotDispEsts(ddsList[[x]], 
                 ylab="Dispersion", 
                 xlab="Mean of Normalized Counts", 
                 main=paste("Dispersion of", x, "Input"))
}

Extracting DE results with or without shrinkage

- The alpha denotes threshold of false discovery rate (FDR) assigned by users.

- In this analysis, the alpha is set to 0.1

# Set FDR threshold 
alpha=0.1 

# FDR threshold vector
FDRv=c("< 0.1", "> 0.1") 

# Initialize lists of result tables 
all.resList <- all.ddsList 

# Set a function cleaning table
Sig.fn <- function(df, Input) {
    
    df <- df %>% 
        rownames_to_column(var="Gene") %>%
        mutate(FDR=ifelse(padj < 0.1 & !is.na(padj), 
                                   FDRv[1], 
                                   FDRv[2]), 
               Input=Input) 
    return(df)
}




for (i in shrinkage) {

    if (i == "None") {

        for (x in TvC) {

        # Extract data frames from unshrunken lfc & clean data 
        all.resList[[i]][[x]] <- as.data.frame(results(all.ddsList[[i]][[x]], 
                                                       contrast=Contrast, 
                                                       alpha=alpha)) %>% 
        Sig.fn(x)

         } 
    } else {

        # Extract data frames from shrunken lfc & clean data
        for (x in TvC) {

            all.resList[[i]][[x]] <- as.data.frame(all.ddsList[[i]][[x]]) %>%
                Sig.fn(x)
    

        }
    }
}





# Explore the results 
summary(all.resList)
##        Length Class  Mode
## None   2      -none- list
## Normal 2      -none- list
## Apeglm 2      -none- list
## Ashr   2      -none- list
head(all.resList[[1]][['TPM']])
##              Gene     baseMean log2FoldChange     lfcSE       stat     pvalue
## 1 ENSG00000000003 7965.6928435     0.01606545 0.1265022  0.1269974 0.89894249
## 2 ENSG00000000005    2.8239528     0.61310858 1.5972550  0.3838514 0.70108859
## 3 ENSG00000000419  899.2563435    -0.21797908 0.1439985 -1.5137592 0.13008694
## 4 ENSG00000000457  596.8308344     0.02584059 0.1578318  0.1637223 0.86994974
## 5 ENSG00000000460  461.1928898    -0.38598899 0.1724779 -2.2379038 0.02522733
## 6 ENSG00000000938    0.1595373     0.96892188 4.0804729  0.2374533 0.81230512
##        padj   FDR Input
## 1 0.9648660 > 0.1   TPM
## 2        NA > 0.1   TPM
## 3 0.3742990 > 0.1   TPM
## 4 0.9545403 > 0.1   TPM
## 5 0.1284758 > 0.1   TPM
## 6        NA > 0.1   TPM
head(all.resList[[1]][['Counts']])
##              Gene     baseMean log2FoldChange     lfcSE       stat     pvalue
## 1 ENSG00000000003 8079.0801619     0.01540008 0.1269664  0.1212925 0.90345933
## 2 ENSG00000000005    2.8594904     0.61234899 1.5811720  0.3872754 0.69855235
## 3 ENSG00000000419  912.4765014    -0.21785565 0.1443733 -1.5089747 0.13130525
## 4 ENSG00000000457  605.6119743     0.02391363 0.1576831  0.1516563 0.87945799
## 5 ENSG00000000460  467.6415182    -0.38743010 0.1721107 -2.2510517 0.02438226
## 6 ENSG00000000938    0.1610882     0.96751347 4.0804729  0.2371082 0.81257287
##        padj   FDR  Input
## 1 0.9671957 > 0.1 Counts
## 2        NA > 0.1 Counts
## 3 0.3771258 > 0.1 Counts
## 4 0.9576948 > 0.1 Counts
## 5 0.1253609 > 0.1 Counts
## 6        NA > 0.1 Counts
head(all.resList[[2]][['TPM']])
##              Gene     baseMean log2FoldChange     lfcSE       stat     pvalue
## 1 ENSG00000000003 7965.6928435     0.01566841 0.1233752  0.1269974 0.89894249
## 2 ENSG00000000005    2.8239528     0.12175895 0.3175826  0.3838514 0.70108859
## 3 ENSG00000000419  899.2563435    -0.21104412 0.1394188 -1.5137592 0.13008694
## 4 ENSG00000000457  596.8308344     0.02485951 0.1518409  0.1637223 0.86994974
## 5 ENSG00000000460  461.1928898    -0.36862111 0.1647104 -2.2379038 0.02522733
## 6 ENSG00000000938    0.1595373     0.03539765 0.1490746  0.2374533 0.81230512
##        padj   FDR Input
## 1 0.9648660 > 0.1   TPM
## 2        NA > 0.1   TPM
## 3 0.3742990 > 0.1   TPM
## 4 0.9545403 > 0.1   TPM
## 5 0.1284758 > 0.1   TPM
## 6        NA > 0.1   TPM
head(all.resList[[2]][['Counts']])
##              Gene     baseMean log2FoldChange     lfcSE       stat     pvalue
## 1 ENSG00000000003 8079.0801619     0.01501330 0.1237769  0.1212925 0.90345933
## 2 ENSG00000000005    2.8594904     0.12268803 0.3171860  0.3872754 0.69855235
## 3 ENSG00000000419  912.4765014    -0.21082731 0.1397174 -1.5089747 0.13130525
## 4 ENSG00000000457  605.6119743     0.02299923 0.1516556  0.1516563 0.87945799
## 5 ENSG00000000460  467.6415182    -0.36991324 0.1643234 -2.2510517 0.02438226
## 6 ENSG00000000938    0.1610882     0.03503450 0.1477600  0.2371082 0.81257287
##        padj   FDR  Input
## 1 0.9671957 > 0.1 Counts
## 2        NA > 0.1 Counts
## 3 0.3771258 > 0.1 Counts
## 4 0.9576948 > 0.1 Counts
## 5 0.1253609 > 0.1 Counts
## 6        NA > 0.1 Counts

Exploring mean-difference relationship with MA plots

- x-axis: expression level (baseMean))

- y-axis: fold change (log2FoldChange)

- Red dashed lines: log2FoldChange = -1 and 1

- Reference: DESeq2 doc “MA-plot”

# Set ylim: has to adjusted by users depending on data 
yl <- c(-25, 25)

# Set min log2 fold change of interest 
mLog <- c(-1, 1)

# Initialize a list storing MA plots
MAList <- ddsList


# Create MA plots

for (i in shrinkage) {

    both.df <- rbind(all.resList[[i]][[TvC[1]]], 
                     all.resList[[i]][[TvC[2]]])

    MAList[[i]] <- ggplot(both.df, 
                              aes(x=baseMean, y=log2FoldChange, color=FDR))  +geom_point() + scale_x_log10() + facet_grid(~Input) + 
                                   theme_bw() + 
                                   scale_color_manual(values=c("blue", "grey")) +
                                   ggtitle(paste("MA plot with", i)) +
                                   ylim(yl[1], yl[2]) + 
                                   geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red") 

}

   

# Print MA plots
MAList
## $TPM
## class: DESeqDataSet 
## dim: 60240 6 
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(60240): ENSG00000000003 ENSG00000000005 ... ENSG00000288674
##   ENSG00000288675
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(4): Sample Group Path sizeFactor
## 
## $Counts
## class: DESeqDataSet 
## dim: 60240 6 
## metadata(1): version
## assays(6): counts avgTxLength ... H cooks
## rownames(60240): ENSG00000000003 ENSG00000000005 ... ENSG00000288674
##   ENSG00000288675
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
## 
## $None

## 
## $Normal

## 
## $Apeglm

## 
## $Ashr

Exploring distribution of false discovery rate (FDR)

- Distribution of adjusted p-val (FDR) was presented

- x-axis: FDR

- y-axis: Number of genes

- Black dashed line: FDR = alpha

# Subset rows with FDR < alpha
both.df <- rbind(all.resList[[1]][['TPM']], 
                 all.resList[[1]][['Counts']])
both.df$Input <- factor(both.df$Input, levels=TvC)

# Plot distribution of FDR
ggplot(both.df,
       aes(x=padj, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() +
ggtitle("Distribution of False Discovery Rate (FDR)") + 
xlab("Adjusted P-Value (FDR)") + 
ylab("Count") + 
geom_vline(xintercept=alpha, 
           color="black", 
           linetype="dashed",
           size=1) + 
scale_x_continuous(breaks=seq(0, 1, by=0.1)) 

Exploring distribution of log2FoldChange by input type

- Black dashed lines: log2FoldChange = -1 and 1

- x-axis: gene expression level (log2FoldChange)

- y-axis: number of genes

# Initialize a lfc list
lfcplotList <- all.resList 

# Create lfc distribution plots
for (i in shrinkage) {

    lfc.df <- rbind(all.resList[[i]][[TvC[1]]], 
                    all.resList[[i]][[TvC[2]]])

    lfc.df <- lfc.df[lfc.df$FDR == "< 0.1",]

    lfc.df$Input <- factor(lfc.df$Input, levels=TvC)

    lfcplotList[[i]] <- ggplot(lfc.df,  # Subset rows with FDR < alpha
                               aes(x=log2FoldChange, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() + ylab("Count") + 
geom_vline(xintercept=c(mLog[1], mLog[2]), 
           color="black", 
           linetype="dashed", 
           size=1) + 
ggtitle(paste("Distribution of Log2FoldChange by Input Type:", i)) + 
xlim(-8, 8)


}

# Print the lfc plots
lfcplotList
## $None

## 
## $Normal

## 
## $Apeglm

## 
## $Ashr

NA statistics: zero count genes & outlier genes

When NAs appear in

- log2FoldChange: zero counts in all samples

- padj: too little information

- pval & padj: at least one replicate was an outlier

# Count number of NA genes  
type=c("Zero Counts", "Outliers", "Total NA Genes") 


# Create a data frame storing NA gene number
NAstat <- both.df %>%
    group_by(Input) %>%
    summarize(zero=sum(is.na(log2FoldChange)), 
              outlier=sum(is.na(pvalue) & is.na(padj))) %>%
    mutate(total=zero + outlier) %>%
    gather(Type, Number, -Input) %>%
    mutate(Type=factor(case_when(Type == "zero" ~ type[1], 
                                 Type == "outlier" ~ type[2], 
                                 Type == "total" ~ type[3]), 
                       levels=type))

# Plot number of NA genes 
ggplot(NAstat, 
       aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) + 
    geom_bar(stat="identity", position="dodge") + 
    theme_bw() +
    geom_text(position=position_dodge(width=1), vjust=1.5) + 
    ggtitle("Number of NA Genes") + 
    ylab("Number of Genes")

Ranking DEGs with the TPM and original count inputs

- fdr.rank: ranked by FDR

- lfc.rank: ranked by absolute fold change

- up.lfc.rank: ranked by magnitude of fold change increase

- down.lfc.rank: ranked by manitude of fold change decrease

# Create a new list having DE table with FDR below alpha
fdr.rank <- all.resList
lfc.rank <- all.resList
up.lfc.rank <- all.resList
down.lfc.rank <- all.resList

# Set a function cleaning a data frame 
filter.fdr.fn <- function(df) {as.data.table(df[df$FDR == FDRv[1],])}

# Set a function creating a column for the rank
Ranking.fn <- function(x) {mutate(x, Rank=1:nrow(x))}


for (i in shrinkage) {

    for (x in TvC) {

        rankdf <- all.resList[[i]][[x]]

        fdr.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(padj) %>% Ranking.fn() 

        lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(desc(abs(log2FoldChange))) %>% Ranking.fn()

        up.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% 
            arrange(desc(log2FoldChange)) %>% 
            Ranking.fn()

        down.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>%
            arrange(log2FoldChange) %>%
            Ranking.fn()

    }
}

# Explore the ranking outputs
head(fdr.rank[[1]][[1]])
##               Gene  baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSG00000169908  2254.333      -4.059309 0.1960933 -20.70091 3.398917e-95
## 2: ENSG00000261008  1643.843      -4.028891 0.2035790 -19.79031 3.608319e-87
## 3: ENSG00000118523  3796.333      -3.494708 0.2090989 -16.71318 1.050769e-62
## 4: ENSG00000136943 11206.252      -3.090493 0.1969195 -15.69420 1.657303e-55
## 5: ENSG00000181104  2467.646      -1.962023 0.1308756 -14.99150 8.344167e-51
## 6: ENSG00000038427  6589.433      -2.228463 0.1488375 -14.97246 1.111298e-50
##            padj   FDR Input Rank
## 1: 6.328443e-91 < 0.1   TPM    1
## 2: 3.359165e-83 < 0.1   TPM    2
## 3: 6.521422e-59 < 0.1   TPM    3
## 4: 7.714333e-52 < 0.1   TPM    4
## 5: 3.107201e-47 < 0.1   TPM    5
## 6: 3.448543e-47 < 0.1   TPM    6
head(fdr.rank[[1]][[2]])
##               Gene  baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSG00000169908  2288.067      -4.059312 0.1950583 -20.81076 3.458154e-96
## 2: ENSG00000261008  1635.608      -4.023630 0.2035590 -19.76641 5.795623e-87
## 3: ENSG00000118523  3853.040      -3.494245 0.2080393 -16.79608 2.606997e-63
## 4: ENSG00000136943 11365.707      -3.090702 0.1961564 -15.75631 6.215816e-56
## 5: ENSG00000038427  6677.623      -2.228988 0.1487473 -14.98506 9.194038e-51
## 6: ENSG00000181104  2504.476      -1.962507 0.1315082 -14.92307 2.332836e-50
##            padj   FDR  Input Rank
## 1: 6.447037e-92 < 0.1 Counts    1
## 2: 5.402390e-83 < 0.1 Counts    2
## 3: 1.620075e-59 < 0.1 Counts    3
## 4: 2.897037e-52 < 0.1 Counts    4
## 5: 3.428089e-47 < 0.1 Counts    5
## 6: 7.248511e-47 < 0.1 Counts    6
head(lfc.rank[[1]][[1]])
##               Gene  baseMean log2FoldChange    lfcSE      stat       pvalue
## 1: ENSG00000244398 15.377178     -20.124630 3.909696 -5.147364 2.641722e-07
## 2: ENSG00000283809 30.167576       8.377090 1.268057  6.606240 3.942030e-11
## 3: ENSG00000169085 24.416072      -8.054346 1.314483 -6.127388 8.933355e-10
## 4: ENSG00000248167 14.783865      -7.337185 1.355943 -5.411133 6.262727e-08
## 5: ENSG00000263958  8.897781      -6.589416 1.758112 -3.748006 1.782457e-04
## 6: ENSG00000198796 46.727730      -6.571173 1.515131 -4.337033 1.444187e-05
##            padj   FDR Input Rank
## 1: 9.606684e-06 < 0.1   TPM    1
## 2: 3.163649e-09 < 0.1   TPM    2
## 3: 5.525918e-08 < 0.1   TPM    3
## 4: 2.632183e-06 < 0.1   TPM    4
## 5: 2.655005e-03 < 0.1   TPM    5
## 6: 3.185938e-04 < 0.1   TPM    6
head(lfc.rank[[1]][[2]])
##               Gene  baseMean log2FoldChange     lfcSE       stat       pvalue
## 1: ENSG00000244398  15.65189     -20.764231 3.9096818  -5.310977 1.090390e-07
## 2: ENSG00000283809  30.69086       8.386429 1.2655524   6.626694 3.432871e-11
## 3: ENSG00000169085  22.92788      -7.974919 1.3156842  -6.061423 1.349221e-09
## 4: ENSG00000198796  45.23995      -7.672070 1.3269181  -5.781871 7.387415e-09
## 5: ENSG00000248167  14.92215      -7.336534 1.3505819  -5.432128 5.568592e-08
## 6: ENSG00000122641 129.68367      -5.939531 0.5738227 -10.350813 4.149479e-25
##            padj   FDR  Input Rank
## 1: 4.306810e-06 < 0.1 Counts    1
## 2: 2.746739e-09 < 0.1 Counts    2
## 3: 8.114042e-08 < 0.1 Counts    3
## 4: 3.794038e-07 < 0.1 Counts    4
## 5: 2.354087e-06 < 0.1 Counts    5
## 6: 2.035756e-22 < 0.1 Counts    6
head(up.lfc.rank[[1]][[1]])
##               Gene  baseMean log2FoldChange     lfcSE     stat       pvalue
## 1: ENSG00000283809 30.167576       8.377090 1.2680572 6.606240 3.942030e-11
## 2: ENSG00000271723 19.774915       5.193793 1.0422077 4.983453 6.245960e-07
## 3: ENSG00000227373 41.205040       4.620244 1.3222441 3.494245 4.754050e-04
## 4: ENSG00000265168 17.724166       4.561790 0.9740951 4.683105 2.825613e-06
## 5: ENSG00000283698  9.252657       3.924808 1.6104430 2.437098 1.480565e-02
## 6: ENSG00000279750  9.870121       3.817917 1.1405856 3.347331 8.159385e-04
##            padj   FDR Input Rank
## 1: 3.163649e-09 < 0.1   TPM    1
## 2: 2.072968e-05 < 0.1   TPM    2
## 3: 5.976749e-03 < 0.1   TPM    3
## 4: 7.691535e-05 < 0.1   TPM    4
## 5: 8.725174e-02 < 0.1   TPM    5
## 6: 9.218422e-03 < 0.1   TPM    6
head(up.lfc.rank[[1]][[2]])
##               Gene  baseMean log2FoldChange     lfcSE     stat       pvalue
## 1: ENSG00000283809 30.690860       8.386429 1.2655524 6.626694 3.432871e-11
## 2: ENSG00000227373 26.791564       4.849108 0.8326809 5.823489 5.763157e-09
## 3: ENSG00000271723 20.185952       4.762474 0.9708544 4.905446 9.321512e-07
## 4: ENSG00000265168 17.958506       4.569418 0.9644351 4.737922 2.159209e-06
## 5: ENSG00000279750  9.980378       3.812218 1.1296849 3.374585 7.392702e-04
## 6: ENSG00000251177  9.063109       3.763735 1.0759497 3.498059 4.686582e-04
##            padj   FDR  Input Rank
## 1: 2.746739e-09 < 0.1 Counts    1
## 2: 3.043698e-07 < 0.1 Counts    2
## 3: 2.867770e-05 < 0.1 Counts    3
## 4: 6.008079e-05 < 0.1 Counts    4
## 5: 8.540299e-03 < 0.1 Counts    5
## 6: 5.895543e-03 < 0.1 Counts    6
head(down.lfc.rank[[1]][[1]])
##               Gene  baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSG00000244398 15.377178     -20.124630 3.9096963 -5.147364 2.641722e-07
## 2: ENSG00000169085 24.416072      -8.054346 1.3144829 -6.127388 8.933355e-10
## 3: ENSG00000248167 14.783865      -7.337185 1.3559425 -5.411133 6.262727e-08
## 4: ENSG00000263958  8.897781      -6.589416 1.7581122 -3.748006 1.782457e-04
## 5: ENSG00000198796 46.727730      -6.571173 1.5151307 -4.337033 1.444187e-05
## 6: ENSG00000205277 31.406357      -5.965202 0.9969242 -5.983606 2.182508e-09
##            padj   FDR Input Rank
## 1: 9.606684e-06 < 0.1   TPM    1
## 2: 5.525918e-08 < 0.1   TPM    2
## 3: 2.632183e-06 < 0.1   TPM    3
## 4: 2.655005e-03 < 0.1   TPM    4
## 5: 3.185938e-04 < 0.1   TPM    5
## 6: 1.269879e-07 < 0.1   TPM    6
head(down.lfc.rank[[1]][[2]])
##               Gene  baseMean log2FoldChange     lfcSE       stat       pvalue
## 1: ENSG00000244398  15.65189     -20.764231 3.9096818  -5.310977 1.090390e-07
## 2: ENSG00000169085  22.92788      -7.974919 1.3156842  -6.061423 1.349221e-09
## 3: ENSG00000198796  45.23995      -7.672070 1.3269181  -5.781871 7.387415e-09
## 4: ENSG00000248167  14.92215      -7.336534 1.3505819  -5.432128 5.568592e-08
## 5: ENSG00000122641 129.68367      -5.939531 0.5738227 -10.350813 4.149479e-25
## 6: ENSG00000205277  19.72852      -5.935479 0.9101660  -6.521315 6.969362e-11
##            padj   FDR  Input Rank
## 1: 4.306810e-06 < 0.1 Counts    1
## 2: 8.114042e-08 < 0.1 Counts    2
## 3: 3.794038e-07 < 0.1 Counts    3
## 4: 2.354087e-06 < 0.1 Counts    4
## 5: 2.035756e-22 < 0.1 Counts    5
## 6: 5.391278e-09 < 0.1 Counts    6

Calculating rank difference

# Set a function rebuilding DE tables with gene ranks 
rankdiff.fn <- function(List){

    # Select columns and join the data frames by gene
    full_join(List[[TvC[1]]][, .(Gene, Input, Rank, baseMean)], 
              List[[TvC[2]]][, .(Gene, Input, Rank, baseMean)], 
              by="Gene") %>%
    
    # Add columns assining gene expression levels, rank differences, and mean ranks
    mutate(logMeanExpression=log(baseMean.x+baseMean.y/2),
           RankDiff=Rank.x-Rank.y, 
           MeanRank=(Rank.x+Rank.y)/2)
} 

# Set a function adding Shrinkage column 
add.shr.fn <- function(df, shr) {mutate(df, Shrinkage=shr)} 

# Set a function rbinding multiple data frames 
rbinds.fn <- function(List) {rbind(List[[1]], 
                                   List[[2]], 
                                   List[[3]], 
                                   List[[4]])}



# Calculate and plot rank difference
for (i in shrinkage) {

    # Calculate rank difference
    fdr.rank[[i]] <- rankdiff.fn(fdr.rank[[i]]) %>% add.shr.fn(i)
    lfc.rank[[i]] <- rankdiff.fn(lfc.rank[[i]]) %>% add.shr.fn(i)
    up.lfc.rank[[i]] <- rankdiff.fn(up.lfc.rank[[i]]) %>% add.shr.fn(i)
    down.lfc.rank[[i]] <- rankdiff.fn(down.lfc.rank[[i]]) %>% add.shr.fn(i) 

}

# Create combined data frames across the shrinkages 
fdr.rank.df <- rbinds.fn(fdr.rank) 
lfc.rank.df <- rbinds.fn(lfc.rank)
up.lfc.rank.df <- rbinds.fn(up.lfc.rank)
down.lfc.rank.df <- rbinds.fn(down.lfc.rank)



# Explore the ranking outputs
head(fdr.rank.df)
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000169908     TPM      1   2254.333  Counts      1   2288.067
## 2: ENSG00000261008     TPM      2   1643.843  Counts      2   1635.608
## 3: ENSG00000118523     TPM      3   3796.333  Counts      3   3853.040
## 4: ENSG00000136943     TPM      4  11206.252  Counts      4  11365.707
## 5: ENSG00000181104     TPM      5   2467.646  Counts      6   2504.476
## 6: ENSG00000038427     TPM      6   6589.433  Counts      5   6677.623
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          8.131050        0      1.0      None
## 2:          7.808586        0      2.0      None
## 3:          8.652223        0      3.0      None
## 4:          9.734424        0      4.0      None
## 5:          8.221448       -1      5.5      None
## 6:          9.203139        1      5.5      None
head(lfc.rank.df)
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398     TPM      1  15.377178  Counts      1   15.65189
## 2: ENSG00000283809     TPM      2  30.167576  Counts      2   30.69086
## 3: ENSG00000169085     TPM      3  24.416072  Counts      3   22.92788
## 4: ENSG00000248167     TPM      4  14.783865  Counts      5   14.92215
## 5: ENSG00000263958     TPM      5   8.897781    <NA>     NA         NA
## 6: ENSG00000198796     TPM      6  46.727730  Counts      4   45.23995
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          3.144287        0      1.0      None
## 2:          3.817998        0      2.0      None
## 3:          3.580180        0      3.0      None
## 4:          3.102114       -1      4.5      None
## 5:                NA       NA       NA      None
## 6:          4.239133        2      5.0      None
head(up.lfc.rank.df)
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000283809     TPM      1  30.167576  Counts      1  30.690860
## 2: ENSG00000271723     TPM      2  19.774915  Counts      3  20.185952
## 3: ENSG00000227373     TPM      3  41.205040  Counts      2  26.791564
## 4: ENSG00000265168     TPM      4  17.724166  Counts      4  17.958506
## 5: ENSG00000283698     TPM      5   9.252657    <NA>     NA         NA
## 6: ENSG00000279750     TPM      6   9.870121  Counts      5   9.980378
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          3.817998        0      1.0      None
## 2:          3.396784       -1      2.5      None
## 3:          4.000049        1      2.5      None
## 4:          3.284792        0      4.0      None
## 5:                NA       NA       NA      None
## 6:          2.698694        1      5.5      None
head(down.lfc.rank.df)
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398     TPM      1  15.377178  Counts      1   15.65189
## 2: ENSG00000169085     TPM      2  24.416072  Counts      2   22.92788
## 3: ENSG00000248167     TPM      3  14.783865  Counts      4   14.92215
## 4: ENSG00000263958     TPM      4   8.897781    <NA>     NA         NA
## 5: ENSG00000198796     TPM      5  46.727730  Counts      3   45.23995
## 6: ENSG00000205277     TPM      6  31.406357  Counts      6   19.72852
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          3.144287        0      1.0      None
## 2:          3.580180        0      2.0      None
## 3:          3.102114       -1      3.5      None
## 4:                NA       NA       NA      None
## 5:          4.239133        2      4.0      None
## 6:          3.720151        0      6.0      None

Visualizing DEG ranks I: TPM- vs Counts-input

- x-axis: rank with TPM input

- y-axis: rank with Counts input

- Black diagonal lines: rank with TPM = rank with Counts

- Dot color: gene expression level (log-baseMean)

# Set a function plotting gene ranks between TPM- (x-axis) and Counts-Inputs (y-axis)
ranking.plot.fn <- function(df, rankedby) {

    df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)

    ggplot(df, 
           aes(x=Rank.x, y=Rank.y, color=logMeanExpression)) + geom_point(alpha=0.5) + facet_grid(~Shrinkage) + theme_bw() + theme(strip.text.x=element_text(size=10)) + xlab("Rank with TPM") + ylab("Rank with Counts") + geom_abline(slope=1, color="black", size=0.5) + ggtitle(paste(rankedby, "Ranking with TPM vs Count Inputs")) + scale_color_gradient(low="blue", high="red") 
}

# Print output plots
ranking.plot.fn(fdr.rank.df, "FDR")

ranking.plot.fn(lfc.rank.df, "Log2FoldChange")

ranking.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)")

ranking.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)")

Visualizing DEG ranks II: Rank difference

- x-axis: expression level (log-baseMean)

- y-axis: rank difference (rank with TPM - rank with Counts)

- Black horizontal lines: rank with TPM = rank with Counts

- Dot color: average rank

# Set a function plotting the rank difference over the gene expression level
rankdiff.plot.fn <- function(df, rankedby, ylim) {

    df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)

    ggplot(df, aes(x=logMeanExpression, y=RankDiff, color=MeanRank)) + 
        geom_point(alpha=0.5) + 
        theme_bw() + 
        facet_grid(~Shrinkage) + 
        theme(strip.text.x=element_text(size=10)) + 
        ylab("Rank Difference (TPM - Count)") + 
        ggtitle(paste("Rank Difference (TPM - Count):", rankedby)) + 
        geom_hline(yintercept=0, color="black", size=0.5) + scale_color_gradient(low="blue", high="red") 
}

# Print output plots
rankdiff.plot.fn(fdr.rank.df, "FDR", 100)

rankdiff.plot.fn(lfc.rank.df, "Log2FoldChange", 120)

rankdiff.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)", 100)

rankdiff.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)", 75)

Distribution of rank difference in unshrunken data

- y-axis: abs(TPM-Count inputs)

- x-axis: FDR or log2FoldChange (Increase/Decrease)

# Set a function subsetting unshrunken data
unshr.rankdiff.fn <- function(df) {subset(df, Shrinkage == "None")}

# Create a list storing rankdiff data frames 
rankList <- list(unshr.rankdiff.fn(fdr.rank.df),
                 unshr.rankdiff.fn(lfc.rank.df),
                 unshr.rankdiff.fn(up.lfc.rank.df),
                 unshr.rankdiff.fn(down.lfc.rank.df))

# Assine result column as a factor with levels 
rankdiff.levels <- c("FDR", 
                     "log2FoldChange", 
                     "log2FoldChange.Increase", 
                     "log2FoldChange.Decrease")


# Name the list 
names(rankList) <- rankdiff.levels

# Create a new data frame storing rank difference by result type
rankdiff.dist <- data.frame(FDR=abs(rankList[[1]]$RankDiff), 
                            log2FoldChange=abs(rankList[[2]]$RankDiff),
                            log2FoldChange.Increase=abs(rankList[[3]]$RankDiff),
                            log2FoldChange.Decrease=abs(rankList[[4]]$RankDiff)) %>% gather(Result, RankDiff) 

rankdiff.dist$Result <- factor(rankdiff.dist$Result, levels=rankdiff.levels)

# Plot distribution of absolute rank difference
ggplot(rankdiff.dist,
       aes(x=Result, y=RankDiff, color=Result)) +
geom_jitter(alpha=0.5) + 
geom_boxplot(alpha=0.5, fill="grey", color="black") + 
theme_bw() + 
theme(axis.text.x=element_text(angle=45, hjust=1)) +
ggtitle("Distribution of Absolute Rank Difference without Shrinkage \n(TPM - Count Input)") + 
ylab("Absolute Rank Difference (TPM - Count Input)") 

Relationship between rank difference and number of transcript versions

- y-axis: abs(TPM-Count inputs)

- x-axis: number of transcripts (number of transcript id / gene id)

- dot color: mean rank

- 16 genes were missing in the plots

# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>% 
    group_by(GENEID) %>% 
    summarize(num.trans=n_distinct(TXID))

# Set a function adding the number of transcripts by gene id 
add.ntrans.fn <- function(df) {

    left_join(df, AnnoDb.ntrans, by=c("Gene"="GENEID"))}




# Add a column indicating the number of transcripts by gene id to every rankdiff data frame
for (x in rankdiff.levels) {

    rankList[[x]] <- add.ntrans.fn(rankList[[x]])
}


# Explore the edited data frames
summary(rankList)
##                         Length Class      Mode
## FDR                     12     data.table list
## log2FoldChange          12     data.table list
## log2FoldChange.Increase 12     data.table list
## log2FoldChange.Decrease 12     data.table list
head(rankList[[1]])
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000169908     TPM      1   2254.333  Counts      1   2288.067
## 2: ENSG00000261008     TPM      2   1643.843  Counts      2   1635.608
## 3: ENSG00000118523     TPM      3   3796.333  Counts      3   3853.040
## 4: ENSG00000136943     TPM      4  11206.252  Counts      4  11365.707
## 5: ENSG00000181104     TPM      5   2467.646  Counts      6   2504.476
## 6: ENSG00000038427     TPM      6   6589.433  Counts      5   6677.623
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          8.131050        0      1.0      None         4
## 2:          7.808586        0      2.0      None        18
## 3:          8.652223        0      3.0      None         1
## 4:          9.734424        0      4.0      None         3
## 5:          8.221448       -1      5.5      None         2
## 6:          9.203139        1      5.5      None        12
head(rankList[[2]])
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398     TPM      1  15.377178  Counts      1   15.65189
## 2: ENSG00000283809     TPM      2  30.167576  Counts      2   30.69086
## 3: ENSG00000169085     TPM      3  24.416072  Counts      3   22.92788
## 4: ENSG00000248167     TPM      4  14.783865  Counts      5   14.92215
## 5: ENSG00000263958     TPM      5   8.897781    <NA>     NA         NA
## 6: ENSG00000198796     TPM      6  46.727730  Counts      4   45.23995
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          3.144287        0      1.0      None         1
## 2:          3.817998        0      2.0      None         1
## 3:          3.580180        0      3.0      None        10
## 4:          3.102114       -1      4.5      None         1
## 5:                NA       NA       NA      None         3
## 6:          4.239133        2      5.0      None         6
head(rankList[[3]])
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000283809     TPM      1  30.167576  Counts      1  30.690860
## 2: ENSG00000271723     TPM      2  19.774915  Counts      3  20.185952
## 3: ENSG00000227373     TPM      3  41.205040  Counts      2  26.791564
## 4: ENSG00000265168     TPM      4  17.724166  Counts      4  17.958506
## 5: ENSG00000283698     TPM      5   9.252657    <NA>     NA         NA
## 6: ENSG00000279750     TPM      6   9.870121  Counts      5   9.980378
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          3.817998        0      1.0      None         1
## 2:          3.396784       -1      2.5      None         4
## 3:          4.000049        1      2.5      None         7
## 4:          3.284792        0      4.0      None         1
## 5:                NA       NA       NA      None         2
## 6:          2.698694        1      5.5      None         2
head(rankList[[4]])
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398     TPM      1  15.377178  Counts      1   15.65189
## 2: ENSG00000169085     TPM      2  24.416072  Counts      2   22.92788
## 3: ENSG00000248167     TPM      3  14.783865  Counts      4   14.92215
## 4: ENSG00000263958     TPM      4   8.897781    <NA>     NA         NA
## 5: ENSG00000198796     TPM      5  46.727730  Counts      3   45.23995
## 6: ENSG00000205277     TPM      6  31.406357  Counts      6   19.72852
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          3.144287        0      1.0      None         1
## 2:          3.580180        0      2.0      None        10
## 3:          3.102114       -1      3.5      None         1
## 4:                NA       NA       NA      None         3
## 5:          4.239133        2      4.0      None         6
## 6:          3.720151        0      6.0      None         6
# Set a function plotting rank difference vs number of transcripts 
rank.ntrans.plot.fn <- function(df, title) {

    ggplot(df, aes(x=num.trans, y=abs(RankDiff), color=MeanRank)) + 
        geom_jitter(alpha=0.5) + 
        theme_bw() + 
        ggtitle(paste("Rank Difference vs Number of Alternative Transcripts \nin", title)) + 
        xlab("Number of Alternative Transcripts") + 
        ylab("Absolute Rank Difference \n(TPM - Counts)") + scale_color_gradient(low="blue", high="red") 
}

# Print the plots
rank.ntrans.plot.fn(rankList[[1]], "FDR")

rank.ntrans.plot.fn(rankList[[2]], "log2FoldChange")

rank.ntrans.plot.fn(rankList[[3]], "log2FoldChange (Increase)")

rank.ntrans.plot.fn(rankList[[4]], "log2FoldChange (Decrease)")

Finding genes having large difference in rankings

# Initialize a list storing rankdiff genes 
large.rankdiff <- rankList

# Assign a vector storing minimum (thresholds) rankdiff for filtering large rankdiff genes
rankdiff.thr <- c(10, 10, 10, 10)

names(rankdiff.thr) <- rankdiff.levels

for (x in rankdiff.levels) {

    # Filter out observations below the rankdiff thresholds
    large.rankdiff[[x]] <- subset(rankList[[x]], 
                                      abs(RankDiff) > rankdiff.thr[x]) 

}

# Explore the filtered genes 
summary(large.rankdiff)
##                         Length Class      Mode
## FDR                     12     data.table list
## log2FoldChange          12     data.table list
## log2FoldChange.Increase 12     data.table list
## log2FoldChange.Decrease 12     data.table list
dim(large.rankdiff[[rankdiff.levels[1]]])
## [1] 1968   12
dim(large.rankdiff[[rankdiff.levels[2]]])
## [1] 1330   12
dim(large.rankdiff[[rankdiff.levels[3]]])
## [1] 1036   12
dim(large.rankdiff[[rankdiff.levels[4]]])
## [1] 846  12
# Set a function saving rankdiff.csv files
saving.fn <- function(input.df, data.type) { 

    filename <- paste0("tpmVScount_", data.type, "_RankDiff.csv")

    return(write.csv(input.df, filename))
}



# Save the filtered and arranged data frames as csv files
saving.fn(large.rankdiff[[rankdiff.levels[1]]], "FDR")
saving.fn(large.rankdiff[[rankdiff.levels[2]]], "LFC")
saving.fn(large.rankdiff[[rankdiff.levels[3]]], "LFC_Increase")
saving.fn(large.rankdiff[[rankdiff.levels[4]]], "LFC_Decrease")

Summarizing up/down DEGs with an upset plot

- red bar: input type

- blue bar: directionality of gene expression change

# Set a function cleaning data to generate upset plots 
upset.input.fn <- function(df) {

    # Filter genes with valid padj
    df <- subset(df, !is.na(padj)) %>% 

        
        mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes? 
               
               Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""),  # What are downregulated genes? 
               
               Unchanged=ifelse(FDR == FDRv[2], Gene, ""),   # What are unchanged genes? 
               
               TPM_Input=ifelse(Input == "TPM", Gene, ""),   # What are the genes from TPM input? 
               
               Counts_Input=ifelse(Input == "Counts", Gene, ""))   # What are the genes from Counts input? 

    # Create a list storing groups of interest
    upsetInput <- list(Up=df$Up, 
                       Down=df$Down, 
                       Unchanged=df$Unchanged, 
                       TPM_Input=df$TPM, 
                       Counts_Input=df$Counts) 

    return(upsetInput)

}

upsetList <- upset.input.fn(both.df)


# Create the upset plot 
upset(fromList(upsetList), 
      sets=names(upsetList),   # What group to display 
      sets.x.label="Number of Genes per Group",
      order.by="freq",
      point.size=3,
      sets.bar.color=c("red", "red","blue", "blue", "blue"),
      text.scale = 1.5, number.angles=30) 

Session Info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/mira/miniconda3/envs/r/lib/libopenblasp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ashr_2.2-47                 apeglm_1.12.0              
##  [3] ensembldb_2.14.0            AnnotationFilter_1.14.0    
##  [5] GenomicFeatures_1.42.1      AnnotationDbi_1.52.0       
##  [7] UpSetR_1.4.0                gridExtra_2.3              
##  [9] pheatmap_1.0.12             DESeq2_1.30.0              
## [11] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [13] MatrixGenerics_1.2.0        matrixStats_0.57.0         
## [15] GenomicRanges_1.42.0        GenomeInfoDb_1.26.2        
## [17] IRanges_2.24.0              S4Vectors_0.28.1           
## [19] tximport_1.18.0             forcats_0.5.0              
## [21] stringr_1.4.0               dplyr_1.0.2                
## [23] purrr_0.3.4                 readr_1.4.0                
## [25] tidyr_1.1.2                 tibble_3.0.4               
## [27] ggplot2_3.3.2               tidyverse_1.3.0            
## [29] AnnotationHub_2.22.0        BiocFileCache_1.14.0       
## [31] dbplyr_2.0.0                BiocGenerics_0.36.0        
## [33] rmarkdown_2.5               data.table_1.13.4          
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                  backports_1.2.1              
##   [3] plyr_1.8.6                    lazyeval_0.2.2               
##   [5] splines_4.0.3                 BiocParallel_1.24.1          
##   [7] digest_0.6.27                 invgamma_1.1                 
##   [9] htmltools_0.5.0               SQUAREM_2020.5               
##  [11] fansi_0.4.1                   magrittr_2.0.1               
##  [13] memoise_1.1.0                 Biostrings_2.58.0            
##  [15] annotate_1.68.0               modelr_0.1.8                 
##  [17] askpass_1.1                   bdsmatrix_1.3-4              
##  [19] prettyunits_1.1.1             colorspace_2.0-0             
##  [21] blob_1.2.1                    rvest_0.3.6                  
##  [23] rappdirs_0.3.1                haven_2.3.1                  
##  [25] xfun_0.19                     crayon_1.3.4                 
##  [27] RCurl_1.98-1.2                jsonlite_1.7.2               
##  [29] genefilter_1.72.0             survival_3.2-7               
##  [31] glue_1.4.2                    gtable_0.3.0                 
##  [33] zlibbioc_1.36.0               XVector_0.30.0               
##  [35] DelayedArray_0.16.0           scales_1.1.1                 
##  [37] mvtnorm_1.1-1                 DBI_1.1.0                    
##  [39] Rcpp_1.0.5                    xtable_1.8-4                 
##  [41] progress_1.2.2                emdbook_1.3.12               
##  [43] bit_4.0.4                     truncnorm_1.0-8              
##  [45] httr_1.4.2                    RColorBrewer_1.1-2           
##  [47] ellipsis_0.3.1                farver_2.0.3                 
##  [49] pkgconfig_2.0.3               XML_3.99-0.5                 
##  [51] utf8_1.1.4                    locfit_1.5-9.4               
##  [53] labeling_0.4.2                tidyselect_1.1.0             
##  [55] rlang_0.4.9                   later_1.1.0.1                
##  [57] munsell_0.5.0                 BiocVersion_3.12.0           
##  [59] cellranger_1.1.0              tools_4.0.3                  
##  [61] cli_2.2.0                     generics_0.1.0               
##  [63] RSQLite_2.2.1                 broom_0.7.2                  
##  [65] evaluate_0.14                 fastmap_1.0.1                
##  [67] yaml_2.2.1                    knitr_1.30                   
##  [69] bit64_4.0.5                   fs_1.5.0                     
##  [71] mime_0.9                      xml2_1.3.2                   
##  [73] biomaRt_2.46.0                compiler_4.0.3               
##  [75] rstudioapi_0.13               curl_4.3                     
##  [77] interactiveDisplayBase_1.28.0 reprex_0.3.0                 
##  [79] geneplotter_1.68.0            stringi_1.5.3                
##  [81] ps_1.5.0                      lattice_0.20-41              
##  [83] ProtGenerics_1.22.0           Matrix_1.2-18                
##  [85] vctrs_0.3.5                   pillar_1.4.7                 
##  [87] lifecycle_0.2.0               BiocManager_1.30.10          
##  [89] bitops_1.0-6                  irlba_2.3.3                  
##  [91] httpuv_1.5.4                  rtracklayer_1.50.0           
##  [93] R6_2.5.0                      promises_1.1.1               
##  [95] MASS_7.3-53                   assertthat_0.2.1             
##  [97] openssl_1.4.3                 withr_2.3.0                  
##  [99] GenomicAlignments_1.26.0      Rsamtools_2.6.0              
## [101] GenomeInfoDbData_1.2.4        hms_0.5.3                    
## [103] grid_4.0.3                    coda_0.19-4                  
## [105] mixsqp_0.3-43                 bbmle_1.0.23.1               
## [107] numDeriv_2016.8-1.1           shiny_1.5.0                  
## [109] lubridate_1.7.9.2